*** How worried should we be? The implications of fabricated survey data for political science
*** Table A13: Demographic composition of clean and fake data in Peru

* Session setup: disable output paging so the script runs without pausing
set more off

* set directory to location of dataset in following line
cd "C:\~\Downloads\"

use "QAC_2017_12.21.17.dta", clear

* keep only country code 11 (the file header describes this table as Peru)
keep if pais == 11

* clean_data flag:
*   1 = interview not cancelled (cancelled == 0)
*   0 = cancelled, or ls3 carries the 999999 code, or flagged as fraud
*   . = cancelled is neither 0 nor 1 and none of the zero-conditions apply
generate clean_data = .
replace clean_data = 1 if cancelled == 0
replace clean_data = 0 if cancelled == 1 | ls3 == 999999 | fraud == 1

* cases with ls3 == 999999 are treated as non-fraud from here on; this runs
* after clean_data is built, so clean_data used the original fraud values
replace fraud = 0 if ls3 == 999999

** gen age in years
* NOTE(review): despite its name, birth_yr is copied into age_yrs and compared
* against age ranges (18-25, ...) below, so q2 appears to hold age in years,
* not a birth year -- confirm against the codebook.
gen birth_yr = q2
* recode the special codes 888888 / 988888 / 999999 to missing
replace birth_yr = . if inlist(birth_yr, 888888, 988888, 999999)

* survey date (daily) and its calendar year, parsed from the string variable date
gen svy_date = date(date, "MDYhm")
format svy_date %td
gen year = year(svy_date)

* upload timestamp (%tC datetime) and the hour of day it occurred
gen upload_time = Clock(upload, "MDYhm")
format upload_time %tC
gen upload_hour = hhC(upload_time)

* age cohorts; respondents with missing age stay missing in edad
gen age_yrs = birth_yr
gen edad = 1 if inrange(age_yrs, 18, 25)
replace edad = 2 if inrange(age_yrs, 26, 35)
replace edad = 3 if inrange(age_yrs, 36, 45)
replace edad = 4 if inrange(age_yrs, 46, 55)
replace edad = 5 if inrange(age_yrs, 56, 65)
* Stata treats missing as larger than any number, so "age_yrs >= 66" alone
* would put every missing age into the 66+ cohort; exclude missings explicitly
replace edad = 6 if age_yrs >= 66 & !missing(age_yrs)
label var edad "Age Cohort"
label define agegroup 1 "18-25 years old" 2 "26-35 years old" 3 "36-45 years old" 4 "46-55 years old" 5 "56-65 years old" 6 "66+ years old" 
label values edad agegroup

* education level: collapse ed into three tiers
*   0-6 -> 1, 7-11 -> 2, 12-18 -> 3; any other value (incl. missing) -> missing
generate ed_level = cond(inlist(ed, 0, 1, 2, 3, 4, 5, 6), 1, ///
                    cond(inlist(ed, 7, 8, 9, 10, 11), 2, ///
                    cond(inlist(ed, 12, 13, 14, 15, 16, 17, 18), 3, .)))
label variable ed_level "Education Level"
label define edlevel 1 "None or Primary Ed." 2 "Secondary Ed." 3 "University+ Ed."
label values ed_level edlevel

* label ed and recode its special codes 888888 / 988888 / 999999 to missing
label variable ed "Years of Education"
replace ed = . if inlist(ed, 888888, 988888, 999999)

* relabel gender variable
label define q1 1 "Male" 2 "Female"
label values q1 q1

** matching
* flag cases whose status is "Canceled" but that are not labeled as fraudulent
* (1 = such a case, 0 = everything else), then remove them from the data
generate canceled_nonfraud = (status == "Canceled" & fraud == 0)
drop if canceled_nonfraud == 1

* seed is set before cem; presumably so the k2k matching is reproducible
set seed 339487731

* match on upm1a, q1 and agequota with fraud as the treatment indicator;
* the if-condition is always true here because flagged cases were just dropped
cem upm1a (#0) q1 (#0) agequota (#0) if canceled_nonfraud == 0, tr(fraud) k2k

** label data for full sample comparisons
* first drop non-clean cases (clean_data == 0) that cem did not match
drop if clean_data == 0 & cem_matched != 1
* 1 = fake (matched), 2 = clean matched, 3 = rest of data (unmatched non-fraud)
gen comparison_groups = 1 if fraud == 1 & cem_matched == 1
replace comparison_groups = 2 if fraud == 0 & cem_matched == 1
replace comparison_groups = 3 if fraud == 0 & cem_matched == 0

** Table A13 values
* clean data (clean_data == 1): age cohort by gender, cell percentages
tab edad q1 if clean_data == 1, cell
* fake / non-clean data (clean_data == 0): age cohort by gender, cell percentages
tab edad q1 if clean_data == 0, cell
